{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import warnings;warnings.filterwarnings('ignore')\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.naive_bayes import GaussianNB,MultinomialNB\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read both CSVs with '\\n' as the explicit line terminator (presumably so that stray\n",
    "# '\\r' characters inside reviews do not split rows -- confirm against the data).\n",
    "df_train = pd.read_csv('train.csv',lineterminator='\\n')\n",
    "df_test = pd.read_csv('test.csv',lineterminator='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>review</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jo bhi ap se tou behtar hoon</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>ya Allah meri sister Affia ki madad farma</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Yeh khud chahta a is umar main shadi krna.  ha...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Tc ? Apky mun xe exe alfax achy nae lgty 😒💃</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Good</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID                                             review  label\n",
       "0   1                       Jo bhi ap se tou behtar hoon      0\n",
       "1   2          ya Allah meri sister Affia ki madad farma      1\n",
       "2   3  Yeh khud chahta a is umar main shadi krna.  ha...      0\n",
       "3   4        Tc ? Apky mun xe exe alfax achy nae lgty 😒💃      0\n",
       "4   5                                               Good      1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode the sentiment labels as integers (Negative -> 0, Positive -> 1).\n",
    "# The guard keeps the cell idempotent: mapping an already-encoded (numeric) column\n",
    "# a second time would turn every label into NaN.\n",
    "if not pd.api.types.is_numeric_dtype(df_train['label']):\n",
    "    df_train['label'] = df_train['label'].map({'Negative':0,'Positive':1})\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID        0\n",
       "review    0\n",
       "label     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count missing values per column of the training frame.\n",
    "df_train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID        0\n",
       "review    0\n",
       "label     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): this repeats the previous cell's null count on df_train;\n",
    "# possibly df_test was meant to be checked here -- confirm.\n",
    "df_train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    3361\n",
       "0    2967\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class balance of the encoded labels.\n",
    "df_train['label'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 'Jo bhi ap se tou behtar hoon', 0],\n",
       "       [2, 'ya Allah meri sister Affia ki madad farma', 1],\n",
       "       [3, 'Yeh khud chahta a is umar main shadi krna.  had ogi', 0],\n",
       "       [4, 'Tc ? Apky mun xe exe alfax achy nae lgty 😒💃', 0]],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;\n",
    "# .values yields the same ndarray and exists on every pandas version.\n",
    "numpy_array = df_train.values\n",
    "numpy_array_test = df_test.values\n",
    "numpy_array[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([116, 'Kisi Roz Jo Meri Gali Se Ishq Ka Janaza Nikle '],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check a single row of the test array.\n",
    "numpy_array_test[115]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two common ways to clean data\n",
    "# Ordered (pattern, replacement) passes used by cleaner(). Order matters: each pass\n",
    "# sees the output of the previous one (e.g. '#.' has to go before single dots do).\n",
    "# Three passes of the previous version are omitted because they could never match\n",
    "# (they needed a backslash, dot or digit, all of which earlier passes remove); one of\n",
    "# them was also a non-raw pattern string containing an invalid escape sequence.\n",
    "_CLEANER_PASSES = [\n",
    "  (re.compile(r'#\\.'), ''),       # the two-character sequence '#.'\n",
    "  (re.compile(r'\\n'), ''),        # newlines\n",
    "  (re.compile(r','), ''),\n",
    "  (re.compile(r'-'), ' '),         # hyphens become spaces\n",
    "  (re.compile(r'\\.'), ''),\n",
    "  (re.compile(r'\\\\'), ' '),      # backslashes become spaces\n",
    "  (re.compile(r'\\d'), ''),        # digits\n",
    "  (re.compile(r'^_.'), ''),        # a leading underscore plus the character after it\n",
    "  (re.compile(r'_'), ' '),         # remaining underscores become spaces\n",
    "  (re.compile(r'^ '), ''),         # one leading space\n",
    "  (re.compile(r' $'), ''),         # one trailing space\n",
    "  (re.compile(r'\\?'), ''),\n",
    "  (re.compile(r'[é§¦æ]'), ''),     # stray symbols\n",
    "]\n",
    "\n",
    "def cleaner(word):\n",
    "  \"\"\"Remove punctuation, digits and a few stray symbols from `word`, then lower-case it.\n",
    "\n",
    "  The passes in _CLEANER_PASSES are applied in order.\n",
    "\n",
    "  Parameters\n",
    "  ----------\n",
    "  word : str\n",
    "      Text to clean; array_cleaner() passes one space-separated token at a time.\n",
    "\n",
    "  Returns\n",
    "  -------\n",
    "  str\n",
    "      The cleaned, lower-cased text (may be empty).\n",
    "  \"\"\"\n",
    "  for pattern, replacement in _CLEANER_PASSES:\n",
    "    word = pattern.sub(replacement, word)\n",
    "  return word.lower()\n",
    "def hashing(word):\n",
    "  \"\"\"Fold spelling variants of a word into one key via ordered substitution rules.\n",
    "\n",
    "  The rules rewrite vowel/consonant spellings (for example 'ain' -> 'ein' at the end\n",
    "  of the word, repeated letters collapsed, 'u' -> 'o', 'k' -> 'q'); presumably they\n",
    "  target romanised-Urdu spelling variation -- confirm with the author.\n",
    "\n",
    "  Parameters\n",
    "  ----------\n",
    "  word : str\n",
    "      A single token; the rules are written for lower-case letters.\n",
    "\n",
    "  Returns\n",
    "  -------\n",
    "  str\n",
    "      The normalised key.\n",
    "  \"\"\"\n",
    "  word = re.sub(r'ain$', r'ein', word)\n",
    "  word = re.sub(r'ai', r'ae', word)\n",
    "  word = re.sub(r'ay$', r'e', word)\n",
    "  word = re.sub(r'ey$', r'e', word)\n",
    "  word = re.sub(r'ie$', r'y', word)\n",
    "  word = re.sub(r'^es', r'is', word)\n",
    "  word = re.sub(r'a+', r'a', word)\n",
    "  word = re.sub(r'j+', r'j', word)\n",
    "  word = re.sub(r'd+', r'd', word)\n",
    "  word = re.sub(r'u', r'o', word)\n",
    "  word = re.sub(r'o+', r'o', word)\n",
    "  word = re.sub(r'ee+', r'i', word)\n",
    "  # 'ar' -> 'r' everywhere, but only for words that do not start with 'ar'.\n",
    "  if not re.match(r'ar', word):\n",
    "    word = re.sub(r'ar', r'r', word)\n",
    "  word = re.sub(r'iy+', r'i', word)\n",
    "  word = re.sub(r'ih+', r'eh', word)\n",
    "  word = re.sub(r's+', r's', word)\n",
    "  # Fixed: this used to search the literal string 'word' instead of the variable,\n",
    "  # so the y -> i rule below could never run.\n",
    "  if re.search(r'[rst]y', word) and word[-1] != 'y':\n",
    "    word = re.sub(r'y', r'i', word)\n",
    "  if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):\n",
    "    word = re.sub(r'i$', r'y', word)\n",
    "  if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):\n",
    "    word = re.sub(r'h', '', word)\n",
    "  word = re.sub(r'k', r'q', word)\n",
    "  return word\n",
    "\n",
    "def array_cleaner(array):\n",
    "  \"\"\"Clean every sentence in `array`, one space-separated piece at a time.\n",
    "\n",
    "  Each sentence is split on single spaces, every piece is passed through cleaner(),\n",
    "  and the cleaned pieces are glued back with one space in front of each piece, so\n",
    "  every returned sentence starts with a space.\n",
    "\n",
    "  Returns a new list of strings; `array` itself is not modified.\n",
    "  \"\"\"\n",
    "  return [\n",
    "    ''.join(' ' + cleaner(piece) for piece in text.split(' '))\n",
    "    for text in array\n",
    "  ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Yaqoob Memon Ki Phansi Zalimana Ghair Insani Hai 20 Saal Qaid Kaat Chukay Thay Amnesty International Ki Bharti Iqdam Ki Muzammat ',\n",
       "       ' Sabit qadam rehna',\n",
       "       'Good decision on ko bhi aam shehryun ki tarah Huqooq hony chahye',\n",
       "       ..., 'Lanat haramdyo', 'Is episode mein koi bhi hot nai hai',\n",
       "       'Mery liye dua kry Allah pak naseeb achy kry ameen'], dtype=object)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column 1 of the test array holds the raw review text.\n",
    "X_test = numpy_array_test[:,1]\n",
    "X_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the test reviews for non-string entries (e.g. NaN from empty CSV fields),\n",
    "# which would make array_cleaner() fail on .split(). Each offending value is printed\n",
    "# with its 1-based position; nothing is printed when every entry is a string.\n",
    "# (The earlier version used a bare except and a counter that only advanced on\n",
    "# success, so positions after the first bad entry were misreported.)\n",
    "for position, sentence in enumerate(X_test, start=1):\n",
    "    if not isinstance(sentence, str):\n",
    "        print(sentence)\n",
    "        print(position)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[' jo bhi ap se tou behtar hoon',\n",
       " ' ya allah meri sister affia ki madad farma',\n",
       " ' yeh khud chahta a is umar main shadi krna  had ogi',\n",
       " ' tc  apky mun xe exe alfax achy nae lgty 😒💃',\n",
       " ' good']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column 1 of the training array is the review text, column 2 the encoded label.\n",
    "X_train = numpy_array[:, 1]\n",
    "# Clean X here\n",
    "X_train = array_cleaner(X_train)\n",
    "# NOTE(review): X_test is overwritten with its cleaned version, so re-running this\n",
    "# cell cleans already-cleaned text again.\n",
    "X_test = array_cleaner(X_test)\n",
    "y_train = numpy_array[:, 2]\n",
    "X_train[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6328\n",
      "2712\n"
     ]
    }
   ],
   "source": [
    "# Number of cleaned training and test sentences.\n",
    "print(len(X_train))\n",
    "print(len(X_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(6328,)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, 0, 1, 0], dtype=int8)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# y_train was sliced out of an object-dtype array; cast it to a compact integer array.\n",
    "y_train = np.array(y_train)\n",
    "y_train = y_train.astype('int8')\n",
    "print(y_train.shape)\n",
    "y_train[:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1])"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: list the distinct label values present in y_train.\n",
    "test1 = pd.Series(y_train)\n",
    "test1.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF-IDF over 1-grams up to `ngram`-grams with sublinear tf scaling; per the\n",
    "# scikit-learn docs, max_df=0.5 ignores terms found in more than half of the documents.\n",
    "ngram = 2\n",
    "vectorizer = TfidfVectorizer(sublinear_tf=True,ngram_range=(1, ngram), max_df=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the TF-IDF vocabulary on train + test text together and vectorize both in one\n",
    "# pass; fit_transform() returns the same matrix as fit() followed by transform().\n",
    "# The raw corpus gets its own name so X_all is only ever the sparse matrix.\n",
    "corpus_all = X_train + X_test\n",
    "lentrain = len(X_train)  # row offset at which the test documents start\n",
    "\n",
    "X_all = vectorizer.fit_transform(corpus_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['賭easar ul', '鄭h', '鄭h isnan', '鄭pwa', '鄭pwa yani']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get_feature_names() was removed in scikit-learn 1.2 in favour of\n",
    "# get_feature_names_out() (added in 1.0); use whichever this version provides.\n",
    "if hasattr(vectorizer, 'get_feature_names_out'):\n",
    "    feature_names = list(vectorizer.get_feature_names_out())\n",
    "else:\n",
    "    feature_names = vectorizer.get_feature_names()\n",
    "feature_names[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9040, 113521)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the combined TF-IDF matrix (rows = train + test documents).\n",
    "X_all.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows [0, lentrain) are the training documents; the remaining rows are the test documents.\n",
    "X_train_chuli = X_all[:lentrain] # Separate back into training and test sets. \n",
    "X_test_chuli = X_all[lentrain:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6328, 113521)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the training part of the TF-IDF matrix.\n",
    "X_train_chuli.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#bayesian optimization to find hyperparameter for lightgbm\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "from bayes_opt import BayesianOptimization\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "def LGB_CV(\n",
    "          min_data_in_leaf,\n",
    "          feature_fraction,\n",
    "          bagging_fraction,\n",
    "         ):\n",
    "    \"\"\"Objective for the Bayesian search: out-of-fold ROC AUC of LightGBM.\n",
    "\n",
    "    Trains one binary LightGBM model per fold of a fixed, shuffled 5-fold split of the\n",
    "    notebook-level X_train_chuli / y_train, early-stopping on the held-out fold, and\n",
    "    scores the collected out-of-fold predictions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    min_data_in_leaf : float\n",
    "        Truncated with int() before it is handed to LightGBM.\n",
    "    feature_fraction, bagging_fraction : float\n",
    "        Handed to LightGBM unchanged.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        ROC AUC of the out-of-fold predictions against y_train.\n",
    "    \"\"\"\n",
    "    splitter = KFold(n_splits=5, shuffle=True, random_state=2019)\n",
    "    oof_scores = np.zeros(X_train_chuli.shape[0])\n",
    "\n",
    "    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train_chuli, y_train)):\n",
    "        print(\"fold n°{}\".format(fold_no))\n",
    "        fit_set = lgb.Dataset(X_train_chuli[fit_rows], label=y_train[fit_rows])\n",
    "        holdout_set = lgb.Dataset(X_train_chuli[holdout_rows], label=y_train[holdout_rows])\n",
    "\n",
    "        # Only the three searched values vary between calls; the rest is fixed.\n",
    "        lgb_params = dict(\n",
    "            max_depth=-1,\n",
    "            min_data_in_leaf=int(min_data_in_leaf),\n",
    "            objective='binary',\n",
    "            bagging_fraction=bagging_fraction,\n",
    "            feature_fraction=feature_fraction,\n",
    "            learning_rate=0.005,\n",
    "            boosting=\"gbdt\",\n",
    "            bagging_freq=5,\n",
    "            bagging_seed=11,\n",
    "            metric='auc',\n",
    "            verbosity=-1,\n",
    "        )\n",
    "\n",
    "        # Up to 8000 rounds, logging every 500 and stopping after 500 rounds\n",
    "        # without improvement on the validation data.\n",
    "        booster = lgb.train(lgb_params,\n",
    "                            fit_set,\n",
    "                            num_boost_round=8000,\n",
    "                            valid_sets=[fit_set, holdout_set],\n",
    "                            verbose_eval=500,\n",
    "                            early_stopping_rounds=500)\n",
    "\n",
    "        oof_scores[holdout_rows] = booster.predict(X_train_chuli[holdout_rows],\n",
    "                                                   num_iteration=booster.best_iteration)\n",
    "\n",
    "    return metrics.roc_auc_score(y_train, oof_scores)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search space for the Bayesian optimisation of LGB_CV. A fixed random_state makes\n",
    "# the sampled points (and therefore the whole search) reproducible across runs.\n",
    "LGB_BO = BayesianOptimization(LGB_CV, {\n",
    "        'min_data_in_leaf': (2, 40),\n",
    "        'bagging_fraction': (0.01, 0.999),\n",
    "        'feature_fraction':(0.01, 0.999)\n",
    "    }, random_state=2019)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|   iter    |  target   | baggin... | featur... | min_da... |\n",
      "-------------------------------------------------------------\n",
      "fold n°0\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.8078\tvalid_1's auc: 0.749847\n",
      "[1000]\ttraining's auc: 0.841791\tvalid_1's auc: 0.752634\n",
      "[1500]\ttraining's auc: 0.867406\tvalid_1's auc: 0.752817\n",
      "[2000]\ttraining's auc: 0.889175\tvalid_1's auc: 0.754049\n",
      "Early stopping, best iteration is:\n",
      "[1902]\ttraining's auc: 0.88553\tvalid_1's auc: 0.755229\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.809128\tvalid_1's auc: 0.733232\n",
      "Early stopping, best iteration is:\n",
      "[460]\ttraining's auc: 0.804646\tvalid_1's auc: 0.7365\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.813148\tvalid_1's auc: 0.716077\n",
      "[1000]\ttraining's auc: 0.847292\tvalid_1's auc: 0.722404\n",
      "[1500]\ttraining's auc: 0.872096\tvalid_1's auc: 0.724389\n",
      "Early stopping, best iteration is:\n",
      "[1154]\ttraining's auc: 0.855722\tvalid_1's auc: 0.724973\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.807152\tvalid_1's auc: 0.742258\n",
      "[1000]\ttraining's auc: 0.840743\tvalid_1's auc: 0.74932\n",
      "[1500]\ttraining's auc: 0.866546\tvalid_1's auc: 0.748771\n",
      "Early stopping, best iteration is:\n",
      "[1209]\ttraining's auc: 0.852048\tvalid_1's auc: 0.751023\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.809607\tvalid_1's auc: 0.746684\n",
      "[1000]\ttraining's auc: 0.84407\tvalid_1's auc: 0.751119\n",
      "[1500]\ttraining's auc: 0.87007\tvalid_1's auc: 0.754485\n",
      "[2000]\ttraining's auc: 0.889589\tvalid_1's auc: 0.754716\n",
      "Early stopping, best iteration is:\n",
      "[1733]\ttraining's auc: 0.88043\tvalid_1's auc: 0.756213\n",
      "| \u001b[0m 1       \u001b[0m | \u001b[0m 0.743   \u001b[0m | \u001b[0m 0.6048  \u001b[0m | \u001b[0m 0.9983  \u001b[0m | \u001b[0m 37.57   \u001b[0m |\n",
      "fold n°0\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.637416\tvalid_1's auc: 0.64061\n",
      "[1000]\ttraining's auc: 0.644168\tvalid_1's auc: 0.643619\n",
      "Early stopping, best iteration is:\n",
      "[750]\ttraining's auc: 0.644447\tvalid_1's auc: 0.65114\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.643459\tvalid_1's auc: 0.61601\n",
      "Early stopping, best iteration is:\n",
      "[18]\ttraining's auc: 0.619306\tvalid_1's auc: 0.622282\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.644515\tvalid_1's auc: 0.606414\n",
      "[1000]\ttraining's auc: 0.656289\tvalid_1's auc: 0.614959\n",
      "[1500]\ttraining's auc: 0.659348\tvalid_1's auc: 0.613489\n",
      "Early stopping, best iteration is:\n",
      "[1015]\ttraining's auc: 0.655175\tvalid_1's auc: 0.618329\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.639572\tvalid_1's auc: 0.615546\n",
      "Early stopping, best iteration is:\n",
      "[2]\ttraining's auc: 0.628403\tvalid_1's auc: 0.635105\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.638198\tvalid_1's auc: 0.643859\n",
      "[1000]\ttraining's auc: 0.653377\tvalid_1's auc: 0.657253\n",
      "[1500]\ttraining's auc: 0.658781\tvalid_1's auc: 0.656369\n",
      "[2000]\ttraining's auc: 0.661776\tvalid_1's auc: 0.652962\n",
      "Early stopping, best iteration is:\n",
      "[1536]\ttraining's auc: 0.659793\tvalid_1's auc: 0.659008\n",
      "| \u001b[0m 2       \u001b[0m | \u001b[0m 0.6191  \u001b[0m | \u001b[0m 0.06231 \u001b[0m | \u001b[0m 0.5005  \u001b[0m | \u001b[0m 28.4    \u001b[0m |\n",
      "fold n°0\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's auc: 0.5\tvalid_1's auc: 0.5\n",
      "| \u001b[0m 3       \u001b[0m | \u001b[0m 0.485   \u001b[0m | \u001b[0m 0.01    \u001b[0m | \u001b[0m 0.01    \u001b[0m | \u001b[0m 40.0    \u001b[0m |\n",
      "fold n°0\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.879041\tvalid_1's auc: 0.790074\n",
      "[1000]\ttraining's auc: 0.931967\tvalid_1's auc: 0.80286\n",
      "[1500]\ttraining's auc: 0.961189\tvalid_1's auc: 0.806492\n",
      "[2000]\ttraining's auc: 0.978269\tvalid_1's auc: 0.809039\n",
      "[2500]\ttraining's auc: 0.987688\tvalid_1's auc: 0.81074\n",
      "Early stopping, best iteration is:\n",
      "[2429]\ttraining's auc: 0.986633\tvalid_1's auc: 0.811102\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.88044\tvalid_1's auc: 0.782168\n",
      "[1000]\ttraining's auc: 0.932543\tvalid_1's auc: 0.798356\n",
      "[1500]\ttraining's auc: 0.96102\tvalid_1's auc: 0.803757\n",
      "[2000]\ttraining's auc: 0.977631\tvalid_1's auc: 0.806902\n",
      "[2500]\ttraining's auc: 0.987358\tvalid_1's auc: 0.808384\n",
      "[3000]\ttraining's auc: 0.993203\tvalid_1's auc: 0.808181\n",
      "Early stopping, best iteration is:\n",
      "[2611]\ttraining's auc: 0.988966\tvalid_1's auc: 0.808651\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.882204\tvalid_1's auc: 0.768463\n",
      "[1000]\ttraining's auc: 0.932963\tvalid_1's auc: 0.787623\n",
      "[1500]\ttraining's auc: 0.962441\tvalid_1's auc: 0.79506\n",
      "[2000]\ttraining's auc: 0.978928\tvalid_1's auc: 0.79877\n",
      "[2500]\ttraining's auc: 0.988151\tvalid_1's auc: 0.801022\n",
      "[3000]\ttraining's auc: 0.993559\tvalid_1's auc: 0.802202\n",
      "[3500]\ttraining's auc: 0.996479\tvalid_1's auc: 0.804228\n",
      "[4000]\ttraining's auc: 0.997999\tvalid_1's auc: 0.8052\n",
      "[4500]\ttraining's auc: 0.998728\tvalid_1's auc: 0.805821\n",
      "[5000]\ttraining's auc: 0.999158\tvalid_1's auc: 0.806141\n",
      "[5500]\ttraining's auc: 0.999422\tvalid_1's auc: 0.806328\n",
      "[6000]\ttraining's auc: 0.999631\tvalid_1's auc: 0.806241\n",
      "Early stopping, best iteration is:\n",
      "[5750]\ttraining's auc: 0.99954\tvalid_1's auc: 0.806538\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.876915\tvalid_1's auc: 0.804948\n",
      "[1000]\ttraining's auc: 0.930358\tvalid_1's auc: 0.828255\n",
      "[1500]\ttraining's auc: 0.96101\tvalid_1's auc: 0.836646\n",
      "[2000]\ttraining's auc: 0.978204\tvalid_1's auc: 0.840365\n",
      "[2500]\ttraining's auc: 0.988065\tvalid_1's auc: 0.843283\n",
      "[3000]\ttraining's auc: 0.993579\tvalid_1's auc: 0.845215\n",
      "[3500]\ttraining's auc: 0.996622\tvalid_1's auc: 0.846036\n",
      "[4000]\ttraining's auc: 0.998112\tvalid_1's auc: 0.847213\n",
      "[4500]\ttraining's auc: 0.998786\tvalid_1's auc: 0.847727\n",
      "[5000]\ttraining's auc: 0.999114\tvalid_1's auc: 0.848253\n",
      "[5500]\ttraining's auc: 0.999352\tvalid_1's auc: 0.848429\n",
      "Early stopping, best iteration is:\n",
      "[5333]\ttraining's auc: 0.999275\tvalid_1's auc: 0.84866\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.880664\tvalid_1's auc: 0.790349\n",
      "[1000]\ttraining's auc: 0.933523\tvalid_1's auc: 0.808412\n",
      "[1500]\ttraining's auc: 0.962895\tvalid_1's auc: 0.815352\n",
      "[2000]\ttraining's auc: 0.979368\tvalid_1's auc: 0.81805\n",
      "[2500]\ttraining's auc: 0.988533\tvalid_1's auc: 0.819682\n",
      "[3000]\ttraining's auc: 0.993795\tvalid_1's auc: 0.820812\n",
      "[3500]\ttraining's auc: 0.996578\tvalid_1's auc: 0.821165\n",
      "Early stopping, best iteration is:\n",
      "[3355]\ttraining's auc: 0.995937\tvalid_1's auc: 0.821318\n",
      "| \u001b[95m 4       \u001b[0m | \u001b[95m 0.8183  \u001b[0m | \u001b[95m 0.999   \u001b[0m | \u001b[95m 0.999   \u001b[0m | \u001b[95m 2.0     \u001b[0m |\n",
      "=============================================================\n"
     ]
    }
   ],
   "source": [
    "# Run the search with init_points=2 and n_iter=2 (four evaluations in total, as the\n",
    "# log shows); every evaluation runs the full 5-fold CV inside LGB_CV.\n",
    "LGB_BO.maximize(init_points=2,n_iter=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.879041\tvalid_1's auc: 0.790074\n",
      "[1000]\ttraining's auc: 0.931967\tvalid_1's auc: 0.80286\n",
      "[1500]\ttraining's auc: 0.961189\tvalid_1's auc: 0.806492\n",
      "[2000]\ttraining's auc: 0.978269\tvalid_1's auc: 0.809039\n",
      "[2500]\ttraining's auc: 0.987688\tvalid_1's auc: 0.81074\n",
      "Early stopping, best iteration is:\n",
      "[2429]\ttraining's auc: 0.986633\tvalid_1's auc: 0.811102\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.88044\tvalid_1's auc: 0.782168\n",
      "[1000]\ttraining's auc: 0.932543\tvalid_1's auc: 0.798356\n",
      "[1500]\ttraining's auc: 0.96102\tvalid_1's auc: 0.803757\n",
      "[2000]\ttraining's auc: 0.977631\tvalid_1's auc: 0.806902\n",
      "[2500]\ttraining's auc: 0.987358\tvalid_1's auc: 0.808384\n",
      "[3000]\ttraining's auc: 0.993203\tvalid_1's auc: 0.808181\n",
      "Early stopping, best iteration is:\n",
      "[2611]\ttraining's auc: 0.988966\tvalid_1's auc: 0.808651\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.882204\tvalid_1's auc: 0.768463\n",
      "[1000]\ttraining's auc: 0.932963\tvalid_1's auc: 0.787623\n",
      "[1500]\ttraining's auc: 0.962441\tvalid_1's auc: 0.79506\n",
      "[2000]\ttraining's auc: 0.978928\tvalid_1's auc: 0.79877\n",
      "[2500]\ttraining's auc: 0.988151\tvalid_1's auc: 0.801022\n",
      "[3000]\ttraining's auc: 0.993559\tvalid_1's auc: 0.802202\n",
      "[3500]\ttraining's auc: 0.996479\tvalid_1's auc: 0.804228\n",
      "[4000]\ttraining's auc: 0.997999\tvalid_1's auc: 0.8052\n",
      "[4500]\ttraining's auc: 0.998728\tvalid_1's auc: 0.805821\n",
      "[5000]\ttraining's auc: 0.999158\tvalid_1's auc: 0.806141\n",
      "[5500]\ttraining's auc: 0.999422\tvalid_1's auc: 0.806328\n",
      "[6000]\ttraining's auc: 0.999631\tvalid_1's auc: 0.806241\n",
      "Early stopping, best iteration is:\n",
      "[5750]\ttraining's auc: 0.99954\tvalid_1's auc: 0.806538\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.876915\tvalid_1's auc: 0.804948\n",
      "[1000]\ttraining's auc: 0.930358\tvalid_1's auc: 0.828255\n",
      "[1500]\ttraining's auc: 0.96101\tvalid_1's auc: 0.836646\n",
      "[2000]\ttraining's auc: 0.978204\tvalid_1's auc: 0.840365\n",
      "[2500]\ttraining's auc: 0.988065\tvalid_1's auc: 0.843283\n",
      "[3000]\ttraining's auc: 0.993579\tvalid_1's auc: 0.845215\n",
      "[3500]\ttraining's auc: 0.996622\tvalid_1's auc: 0.846036\n",
      "[4000]\ttraining's auc: 0.998112\tvalid_1's auc: 0.847213\n",
      "[4500]\ttraining's auc: 0.998786\tvalid_1's auc: 0.847727\n",
      "[5000]\ttraining's auc: 0.999114\tvalid_1's auc: 0.848253\n",
      "[5500]\ttraining's auc: 0.999352\tvalid_1's auc: 0.848429\n",
      "Early stopping, best iteration is:\n",
      "[5333]\ttraining's auc: 0.999275\tvalid_1's auc: 0.84866\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\ttraining's auc: 0.880664\tvalid_1's auc: 0.790349\n",
      "[1000]\ttraining's auc: 0.933523\tvalid_1's auc: 0.808412\n",
      "[1500]\ttraining's auc: 0.962895\tvalid_1's auc: 0.815352\n",
      "[2000]\ttraining's auc: 0.979368\tvalid_1's auc: 0.81805\n",
      "[2500]\ttraining's auc: 0.988533\tvalid_1's auc: 0.819682\n",
      "[3000]\ttraining's auc: 0.993795\tvalid_1's auc: 0.820812\n",
      "[3500]\ttraining's auc: 0.996578\tvalid_1's auc: 0.821165\n",
      "Early stopping, best iteration is:\n",
      "[3355]\ttraining's auc: 0.995937\tvalid_1's auc: 0.821318\n"
     ]
    }
   ],
   "source": [
    "# Final model: repeat the 5-fold split used in LGB_CV with fixed hyper-parameters,\n",
    "# collecting out-of-fold predictions (oof) and fold-averaged test predictions.\n",
    "folds = KFold(n_splits=5, shuffle=True, random_state=2019)\n",
    "oof = np.zeros(X_train_chuli.shape[0])\n",
    "predictions = np.zeros(X_test_chuli.shape[0])\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    trn_data = lgb.Dataset(X_train_chuli[trn_idx],\n",
    "                           label=y_train[trn_idx],\n",
    "                           )\n",
    "    val_data = lgb.Dataset(X_train_chuli[val_idx],\n",
    "                           label=y_train[val_idx],\n",
    "                           )\n",
    "\n",
    "    # NOTE(review): min_data_in_leaf, bagging_fraction and feature_fraction are\n",
    "    # hard-coded; they equal the best row printed by the LGB_BO.maximize() run above.\n",
    "    # Re-check them whenever the search is re-run.\n",
    "    param = {\n",
    "        'max_depth': -1,\n",
    "        'min_data_in_leaf': 2, \n",
    "        'objective':'binary',\n",
    "        'bagging_fraction':0.999,\n",
    "        'feature_fraction':0.999,\n",
    "        'learning_rate': 0.005,\n",
    "        \"boosting\": \"gbdt\",\n",
    "        \"bagging_freq\": 5,\n",
    "        \"bagging_seed\": 11,\n",
    "        \"metric\": 'auc',\n",
    "        \"verbosity\": -1\n",
    "    }\n",
    "\n",
    "    clf = lgb.train(param,\n",
    "                    trn_data,\n",
    "                    8000,\n",
    "                    valid_sets = [trn_data, val_data],\n",
    "                    verbose_eval=500,\n",
    "                    early_stopping_rounds = 500)\n",
    "\n",
    "    # Keep this fold's predictions for its held-out rows.\n",
    "    oof[val_idx] = clf.predict(X_train_chuli[val_idx],\n",
    "                               num_iteration=clf.best_iteration)\n",
    "    # Each fold adds 1/n_splits of its test prediction, so `predictions` ends up as\n",
    "    # the mean over the fold models.\n",
    "    predictions += clf.predict(X_test_chuli, num_iteration=clf.best_iteration) / folds.n_splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2712\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0.22984153, 0.62412493, 0.98233711, 0.79193217])"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: one prediction per test row, plus a peek at the first values.\n",
    "print(len(predictions))\n",
    "predictions[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair each test ID with its fold-averaged prediction and write the result to CSV.\n",
    "lgb_output = pd.DataFrame({\"ID\":df_test[\"ID\"], \"Pred\":predictions})\n",
    "lgb_output.to_csv('lgb_new.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
